library(readr)
library(tibble)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ purrr 1.0.2
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
# Read the replication dataset from its CSV file into a tibble.
data_replic <- readr::read_csv(
  file = "/Users/soniadalal/Desktop/302W/302W Project/data/data and replication code/data_replic.csv"
)
## New names:
## Rows: 1012 Columns: 27
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): iso3, region, income_group dbl (22): year, homrates_unodc, homrates_who,
## deport_convict, deport_convict... lgl (2): ...19, ...23
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...19`
## • `` -> `...23`
# Preview the first rows of the raw data.
data_replic %>% head()
## # A tibble: 6 × 27
## iso3 year region income_group homrates_unodc homrates_who deport_convict
## <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 AFG 2010 South Asia Low income 3.81 NA 0.0194
## 2 AFG 2012 South Asia Low income 5.41 NA 0.0265
## 3 ALB 1996 Europe & … Upper middl… NA 8.36 0.142
## 4 ALB 1998 Europe & … Upper middl… NA 31.7 0.159
## 5 ALB 2000 Europe & … Upper middl… NA 10.7 0.339
## 6 ALB 2002 Europe & … Upper middl… NA 7.13 0.393
## # ℹ 20 more variables: deport_convict_d <dbl>, deport_convict_lead <dbl>,
## # deport_nonconvict <dbl>, deport_nonconvict_d <dbl>, corruption <dbl>,
## # crime_US_weight <dbl>, GDP_growth <dbl>,
## # GDPpercapita_const2010USD_log <dbl>, gini_mkt <dbl>,
## # instr_benef_medicaidpregn_lag1 <dbl>, instr_enforce_everify_lag1 <dbl>,
## # ...19 <lgl>, polity2 <dbl>, pop_sh14 <dbl>, population_log <dbl>,
## # ...23 <lgl>, remittances_GDP <dbl>, urban_interp <dbl>, …
# Per-column summary (ranges, quartiles, NA counts) of the raw data.
data_replic %>% summary()
## iso3 year region income_group
## Length:1012 Min. :1996 Length:1012 Length:1012
## Class :character 1st Qu.:2002 Class :character Class :character
## Mode :character Median :2006 Mode :character Mode :character
## Mean :2006
## 3rd Qu.:2010
## Max. :2014
##
## homrates_unodc homrates_who deport_convict deport_convict_d
## Min. : 0.280 Min. : 0.000 Min. : 0.00000 Min. :-164.00466
## 1st Qu.: 1.394 1st Qu.: 1.092 1st Qu.: 0.04974 1st Qu.: -0.04479
## Median : 3.390 Median : 2.805 Median : 0.17095 Median : 0.00127
## Mean : 8.731 Mean : 7.190 Mean : 6.02820 Mean : 0.34473
## 3rd Qu.: 9.279 3rd Qu.: 9.119 3rd Qu.: 1.14709 3rd Qu.: 0.07173
## Max. :92.960 Max. :74.846 Max. :175.44412 Max. : 57.59695
## NA's :398 NA's :208 NA's :9 NA's :53
## deport_convict_lead deport_nonconvict deport_nonconvict_d corruption
## Min. :-164.00466 Min. : 0.000 Min. :-117.9508 Min. :0.000
## 1st Qu.: -0.05009 1st Qu.: 0.120 1st Qu.: -0.1497 1st Qu.:2.000
## Median : 0.00000 Median : 0.392 Median : 0.0000 Median :2.500
## Mean : 0.19557 Mean : 7.350 Mean : 0.9367 Mean :2.981
## 3rd Qu.: 0.07078 3rd Qu.: 2.470 3rd Qu.: 0.1312 3rd Qu.:4.000
## Max. : 57.59695 Max. :307.606 Max. : 196.3743 Max. :6.000
## NA's :132 NA's :143 NA's :155 NA's :204
## crime_US_weight GDP_growth GDPpercapita_const2010USD_log
## Min. :341.5 Min. :-16.700 Min. : 5.755
## 1st Qu.:407.8 1st Qu.: 1.730 1st Qu.: 8.082
## Median :466.6 Median : 3.852 Median : 8.915
## Mean :464.1 Mean : 3.900 Mean : 8.949
## 3rd Qu.:506.0 3rd Qu.: 5.970 3rd Qu.:10.056
## Max. :724.9 Max. : 34.500 Max. :11.485
## NA's :234 NA's :12 NA's :16
## gini_mkt instr_benef_medicaidpregn_lag1 instr_enforce_everify_lag1
## Min. :27.90 Min. :0.1484 Min. :0.00000
## 1st Qu.:43.00 1st Qu.:0.6028 1st Qu.:0.00000
## Median :46.90 Median :0.7047 Median :0.03464
## Mean :46.53 Mean :0.6826 Mean :0.07168
## 3rd Qu.:50.10 3rd Qu.:0.7784 3rd Qu.:0.09822
## Max. :68.50 Max. :0.9633 Max. :0.79440
## NA's :132 NA's :313 NA's :313
## ...19 polity2 pop_sh14 population_log
## Mode:logical Min. :-9.000 Min. :11.06 Min. : 9.678
## NA's:1012 1st Qu.: 5.000 1st Qu.:17.57 1st Qu.:15.036
## Median : 8.000 Median :24.05 Median :15.958
## Mean : 5.822 Mean :25.39 Mean :15.819
## 3rd Qu.:10.000 3rd Qu.:31.72 3rd Qu.:17.229
## Max. :10.000 Max. :49.87 Max. :21.034
## NA's :153 NA's :37
## ...23 remittances_GDP urban_interp war_intrastate
## Mode:logical Min. : 0.00139 Min. : 10.30 Min. :0.000000
## NA's:1012 1st Qu.: 0.29699 1st Qu.: 49.17 1st Qu.:0.000000
## Median : 1.39941 Median : 64.65 Median :0.000000
## Mean : 3.95100 Mean : 62.13 Mean :0.006917
## 3rd Qu.: 4.83025 3rd Qu.: 77.05 3rd Qu.:0.000000
## Max. :49.28990 Max. :100.00 Max. :1.000000
## NA's :86
## yr_sch
## Min. : 2.870
## 1st Qu.: 7.432
## Median : 9.399
## Mean : 9.029
## 3rd Qu.:10.810
## Max. :16.815
## NA's :183
# Drop the two unnamed columns that read_csv auto-named `...19` and `...23`.
df <- data_replic %>%
  select(-all_of(c("...19", "...23")))

# Same column drop, then keep only the rows in which both deportation rates,
# the UNODC homicide rate, the region and the income group are all present.
newdf <- data_replic %>%
  select(-all_of(c("...19", "...23"))) %>%
  tidyr::drop_na(
    deport_nonconvict, homrates_unodc, region, deport_convict, income_group
  )

newdf
## # A tibble: 537 × 25
## iso3 year region income_group homrates_unodc homrates_who deport_convict
## <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 AFG 2010 South As… Low income 3.81 NA 0.0194
## 2 ALB 2006 Europe &… Upper middl… 4.06 0 1.07
## 3 ALB 2008 Europe &… Upper middl… 3.31 2.87 0.964
## 4 ALB 2010 Europe &… Upper middl… 3.64 1.15 1.59
## 5 ALB 2012 Europe &… Upper middl… 5.18 NA 1.27
## 6 ALB 2014 Europe &… Upper middl… 4.18 NA 0.916
## 7 ARM 2006 Europe &… Lower middl… 2.28 NA 0.993
## 8 ARM 2008 Europe &… Lower middl… 2.92 NA 1.32
## 9 ARM 2010 Europe &… Lower middl… 2.58 1.70 1.04
## 10 ARM 2012 Europe &… Lower middl… 2.28 1.61 1.18
## # ℹ 527 more rows
## # ℹ 18 more variables: deport_convict_d <dbl>, deport_convict_lead <dbl>,
## # deport_nonconvict <dbl>, deport_nonconvict_d <dbl>, corruption <dbl>,
## # crime_US_weight <dbl>, GDP_growth <dbl>,
## # GDPpercapita_const2010USD_log <dbl>, gini_mkt <dbl>,
## # instr_benef_medicaidpregn_lag1 <dbl>, instr_enforce_everify_lag1 <dbl>,
## # polity2 <dbl>, pop_sh14 <dbl>, population_log <dbl>, …
#Make a correlation plot
# Install corrplot only when it is not already available, so that re-running
# the script does not trigger a package installation every time.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
## corrplot 0.92 loaded
# Encode income_group as a numeric rank: its position in the vector below
# (1 = "Low income" ... 5 = "High income: OECD"). Any value not listed,
# including NA, becomes NA.
df <- df %>%
  mutate(
    income_group_numeric = as.numeric(
      match(
        income_group,
        c(
          "Low income",
          "Lower middle income",
          "Upper middle income",
          "High income: nonOECD",
          "High income: OECD"
        )
      )
    )
  )
# Keep the variables used in the correlation analysis and discard every row
# that has a missing value in any of them.
cor_data <- na.omit(
  select(
    df,
    corruption, homrates_unodc, GDPpercapita_const2010USD_log, urban_interp,
    year, deport_convict, deport_nonconvict, polity2, yr_sch,
    income_group_numeric
  )
)

# Pairwise correlations between the selected variables
cor_matrix <- cor_data %>% cor()

# Upper-triangle heat map, variables ordered by hierarchical clustering
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  diag = FALSE,            # leave out the diagonal
  addCoef.col = "black",   # print each coefficient in its cell
  number.cex = 0.8,        # coefficient text size
  tl.col = "black",        # label colour
  tl.srt = 45,             # label rotation
  tl.cex = 0.8             # label text size
)
The correlation plot shows that homicide rates increase as deportation
rates increase. Over the years available in the data, however, neither
homicide rates nor deportation rates increased significantly.
#Income distribution Counts in dataset for each region
# Dodged bar chart: number of rows per region, one bar per income group
ggplot(data = df, mapping = aes(x = region, fill = income_group)) +
  geom_bar(position = "dodge") +
  labs(title = "Income Distribution Counts for Each Region") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
More data on income group is available for Latin America & Caribbean
and Europe & Central Asia than for other regions.
#Convict Deportation distribution in dataset by Region
# Bar plot: number of observations per region that have a convict deportation
# rate recorded. deport_convict is continuous, so it cannot serve as a `fill`
# for geom_bar() (ggplot2 drops it with a warning). Rows where it is missing
# are removed first so the bar heights reflect data availability.
df %>%
  filter(!is.na(deport_convict)) %>%
  ggplot(aes(x = region)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Convict Deportation Rate Distribution Counts per Region")
More data on convict deportation is available for Latin America &
Caribbean and Europe & Central Asia than for other regions.
#Non-Convict Deportation distribution in dataset by Region
# Bar plot: number of observations per region that have a non-convict
# deportation rate recorded. deport_nonconvict is continuous, so it cannot
# serve as a `fill` for geom_bar() (ggplot2 drops it with a warning). Rows
# where it is missing are removed first so the bar heights reflect data
# availability.
df %>%
  filter(!is.na(deport_nonconvict)) %>%
  ggplot(aes(x = region)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Non-Convict Deportation Rate Distribution Counts per Region")
More data on deportation of non-convicts is available for Latin America & Caribbean and Europe & Central Asia than for other regions.
#Corruption distribution in dataset by Region
# Bar plot: number of observations per region that have a corruption score
# recorded. corruption is numeric, so it cannot serve as a `fill` for
# geom_bar() (ggplot2 drops it with a warning). Rows where it is missing are
# removed first so the bar heights reflect data availability.
df %>%
  filter(!is.na(corruption)) %>%
  ggplot(aes(x = region)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Corruption Score Distribution Counts per Region")
More data on corruption is available for Latin America & Caribbean
and Europe & Central Asia than for other regions.
# Boxplot of Corruption Scores for all the Regions
# Work only with the region label and the corruption score
corruption_data <- select(data_replic, region, corruption)

# Mean, median, minimum and maximum corruption score within each region,
# ignoring missing scores
summary_stats <- corruption_data %>%
  group_by(region) %>%
  summarise(
    across(
      corruption,
      list(
        mean = \(score) mean(score, na.rm = TRUE),
        median = \(score) median(score, na.rm = TRUE),
        min = \(score) min(score, na.rm = TRUE),
        max = \(score) max(score, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )

# One box per region, filled by region, with x labels tilted 45 degrees
boxplot_corruption <- ggplot(
  data = corruption_data,
  mapping = aes(x = region, y = corruption, fill = region)
) +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Corruption Scores by Region",
    x = "Region",
    y = "Corruption Score"
  )

# Show the per-region summary table
summary_stats %>% print()
## # A tibble: 8 × 5
## region mean_corruption median_corruption min_corruption max_corruption
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 East Asia & P… 3.24 3 1 5.5
## 2 Europe & Cent… 3.5 3 1 6
## 3 Latin America… 2.49 2.5 1 5
## 4 Middle East &… 2.39 2 1 5
## 5 North America 4.88 5 3.5 6
## 6 South Asia 2.48 2.5 1 4
## 7 Sub-Saharan A… 1.93 2 0 3
## 8 <NA> 2.5 2.5 2 3
# Render the corruption box plot
boxplot_corruption %>% print()
## Warning: Removed 204 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Latin America & Caribbean has more corruption on average than Europe & Central Asia (mean scores of 2.49 vs. 3.5, assuming a lower score indicates more corruption).
#Income distribution proportions for each region in data with no NA for deportation or homicides.
# Count the observations for each region / income-group pair
newdf_with_count <- newdf %>%
  count(region, income_group, name = "count")

# Show the counts (the original line here was a stray `new`, which printed the
# definition of methods::new instead of the table)
newdf_with_count
# Number of observations in each region, summed over its income groups
totals <- summarise(
  group_by(newdf_with_count, region),
  total = sum(count),
  .groups = "drop"
)

# Attach each region's total to its rows and turn counts into within-region
# shares
newdf_with_proportions <- mutate(
  left_join(newdf_with_count, totals, by = "region"),
  proportion = count / total
)

# Dodged bars of the within-region share of each income group
ggplot(
  data = newdf_with_proportions,
  mapping = aes(x = region, y = proportion, fill = income_group)
) +
  geom_col(position = "dodge") +
  ylab("Proportion") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Europe & Central Asia has a higher proportion of High income whereas
Latin America & Caribbean has a higher proportion of Upper middle
income.
#Income distribution proportions for each region with darker colors corresponding to lower income
# Turn income_group into an ordered factor whose levels run from the lowest
# income group to the highest
newdf_with_proportions <- newdf_with_proportions %>%
  mutate(
    income_group = factor(
      income_group,
      levels = c(
        "Low income",
        "Lower middle income",
        "Upper middle income",
        "High income: nonOECD",
        "High income: OECD"
      ),
      ordered = TRUE
    )
  )

# Same proportions plot; the reversed "Blues" palette gives the darkest shade
# to the first (lowest-income) level
ggplot(
  data = newdf_with_proportions,
  mapping = aes(x = region, y = proportion, fill = income_group)
) +
  geom_col(position = position_dodge(preserve = 'single')) +
  scale_fill_brewer(palette = "Blues", direction = -1) +
  labs(y = "Proportion", fill = "Income Group") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Correlational plot with Convict Deportation Rates and Homicide Rates by Region (Scaled and Unscaled)
# Convict deportation rate vs. UNODC homicide rate, one panel per region with
# a linear fit. Each panel gets its own axis ranges; the argument is spelled
# `scales` in full rather than relying on partial matching of `scale`.
ggplot(newdf, aes(x = deport_convict, y = homrates_unodc)) +
  geom_point(aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~ region, scales = "free") +
  labs(
    title = "Correlation Between Convict Deportation Rates and Homicide Rates by Region",
    x = "Convict Deportation Rate",
    y = "UNODC Homicide Rate"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Convict deportation rate vs. UNODC homicide rate, one panel per region with
# a linear fit, all panels sharing the same axis ranges. The x axis shows
# deport_convict, so it is labelled as the convict (not non-convict) rate.
ggplot(newdf, aes(x = deport_convict, y = homrates_unodc)) +
  geom_point(aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~ region) +
  labs(
    title = "Correlation Between Convict Deportation Rates and Homicide Rates by Region",
    x = "Convict Deportation Rate",
    y = "UNODC Homicide Rate"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Correlation between the convict deportation rate and the UNODC homicide
# rate within each region, using complete pairs only
correlation_by_region <- newdf %>%
  group_by(region) %>%
  summarise(
    correlation = cor(
      x = deport_convict,
      y = homrates_unodc,
      use = "complete.obs"
    )
  )
correlation_by_region %>% print()
## # A tibble: 7 × 2
## region correlation
## <chr> <dbl>
## 1 East Asia & Pacific 0.101
## 2 Europe & Central Asia 0.0529
## 3 Latin America & Caribbean 0.616
## 4 Middle East & North Africa -0.0952
## 5 North America 0.996
## 6 South Asia 0.493
## 7 Sub-Saharan Africa -0.0725
# Non-convict deportation rate vs. UNODC homicide rate, one panel per region
# with a linear fit. Each panel gets its own axis ranges; the argument is
# spelled `scales` in full rather than relying on partial matching of `scale`.
ggplot(newdf, aes(x = deport_nonconvict, y = homrates_unodc)) +
  geom_point(aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~ region, scales = "free") +
  labs(
    title = "Correlation Between Non-Convict Deportation Rates and Homicide Rates by Region",
    x = "Non-Convict Deportation Rate",
    y = "UNODC Homicide Rate"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Non-convict deportation rate vs. UNODC homicide rate, one panel per region
# with a linear fit; all panels share the same axis ranges
ggplot(
  data = newdf,
  mapping = aes(x = deport_nonconvict, y = homrates_unodc)
) +
  geom_point(mapping = aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(region)) +
  labs(
    y = "UNODC Homicide Rate",
    x = "Non-Convict Deportation Rate",
    title = "Correlation Between Non-Convict Deportation Rates and Homicide Rates by Region"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Correlation between the non-convict deportation rate and the UNODC homicide
# rate within each region, using complete pairs only
correlation_by_region2 <- newdf %>%
  group_by(region) %>%
  summarise(
    correlation = cor(
      x = deport_nonconvict,
      y = homrates_unodc,
      use = "complete.obs"
    )
  )
correlation_by_region2 %>% print()
## # A tibble: 7 × 2
## region correlation
## <chr> <dbl>
## 1 East Asia & Pacific 0.158
## 2 Europe & Central Asia 0.132
## 3 Latin America & Caribbean 0.576
## 4 Middle East & North Africa -0.142
## 5 North America 0.890
## 6 South Asia 0.364
## 7 Sub-Saharan Africa -0.178
These correlational plots show that the most comprehensive data for both convicts and non-convicts are for Latin America & the Caribbean. For this reason we will focus on this region.
# Distribution of Years in the Dataset
# Store year as an integer before plotting its distribution
newdf <- mutate(newdf, year = as.integer(year))

# Histogram of the number of observations per year (one bin per year).
# No `fill` aesthetic is mapped, so the former scale_fill_manual() call had
# nothing to act on and has been removed.
ggplot(newdf, aes(x = year)) +
  geom_histogram(position = "identity", alpha = 0.5, binwidth = 1) +
  labs(title = "Distribution of Years", x = "Year", y = "Count") +
  theme_minimal()
Data are only available from 2004 to 2014. Together with the correlation
plot, this reveals a limitation: the variables changed little over this
period, which may make it more difficult to find relationships.
# Keep only the Latin America & Caribbean observations
latin_america_data <- newdf %>%
  filter(region == "Latin America & Caribbean")

# Build the regional homicide series on a two-year grid.
# complete() adds a row for any grid year that is absent. Several rows share
# the same year, so approx() sees tied x values; `ties = mean` states
# explicitly that tied values are averaged. It is also approx()'s default, but
# leaving it implicit raises a "collapsing to unique 'x' values" warning.
# Grid years without observations are filled by linear interpolation.
imputed_data1 <- latin_america_data %>%
  tidyr::complete(year = seq(min(year), max(year), by = 2)) %>%
  mutate(
    homrates_unodc = approx(
      x = year[!is.na(homrates_unodc)],
      y = homrates_unodc[!is.na(homrates_unodc)],
      xout = year,
      ties = mean
    )$y
  )

# Line plot of the regional homicide series
ggplot(imputed_data1, aes(x = year, y = homrates_unodc)) +
  geom_line() +
  labs(
    title = "Homicide Rates in Latin America & Caribbean",
    x = "Year",
    y = "Homicide Rates (per 100,000 population)"
  ) +
  theme_minimal()
# Keep only the Europe & Central Asia observations
europe_central_asia_data <- newdf %>%
  filter(region == "Europe & Central Asia")

# Build the regional homicide series on a two-year grid.
# complete() adds a row for any grid year that is absent. Several rows share
# the same year, so approx() sees tied x values; `ties = mean` states
# explicitly that tied values are averaged. It is also approx()'s default, but
# leaving it implicit raises a "collapsing to unique 'x' values" warning.
# Grid years without observations are filled by linear interpolation.
imputed_data2 <- europe_central_asia_data %>%
  tidyr::complete(year = seq(min(year), max(year), by = 2)) %>%
  mutate(
    homrates_unodc = approx(
      x = year[!is.na(homrates_unodc)],
      y = homrates_unodc[!is.na(homrates_unodc)],
      xout = year,
      ties = mean
    )$y
  )

# Line plot of the regional homicide series
ggplot(imputed_data2, aes(x = year, y = homrates_unodc)) +
  geom_line() +
  labs(
    title = "Homicide Rates in Europe and Central Asia",
    x = "Year",
    y = "Homicide Rates (per 100,000 population)"
  ) +
  theme_minimal()
# Stack the two regional series so they can share one plot
combined_data <- rbind(imputed_data1, imputed_data2)

# One homicide-rate line per region, coloured blue and red
ggplot(
  data = combined_data,
  mapping = aes(x = year, y = homrates_unodc, color = region)
) +
  geom_line() +
  scale_color_manual(values = c("blue", "red")) +
  theme_minimal() +
  labs(
    title = "Homicide Rates in Latin America & Caribbean vs. Europe & Central Asia",
    x = "Year",
    y = "Homicide Rates (per 100,000 population)"
  )
Europe & Central Asia homicide rates are low and decrease slightly
over the years, while homicide rates in Latin America & the Caribbean
are much higher and first increase, then decrease.
# Make new dataset with just Latin America and the Caribbean
# Keep only the Latin America & Caribbean observations
latin_america_data <- newdf %>%
  filter(region == "Latin America & Caribbean")

# Build regional series for the homicide rate and both deportation rates on a
# two-year grid. complete() adds a row for any grid year that is absent.
# Several rows share the same year, so approx() sees tied x values;
# `ties = mean` states explicitly that tied values are averaged. It is also
# approx()'s default, but leaving it implicit raises a "collapsing to unique
# 'x' values" warning. Grid years without observations are filled by linear
# interpolation.
imputed_data <- latin_america_data %>%
  tidyr::complete(year = seq(min(year), max(year), by = 2)) %>%
  mutate(
    homrates_unodc = approx(
      x = year[!is.na(homrates_unodc)],
      y = homrates_unodc[!is.na(homrates_unodc)],
      xout = year,
      ties = mean
    )$y,
    deport_convict = approx(
      x = year[!is.na(deport_convict)],
      y = deport_convict[!is.na(deport_convict)],
      xout = year,
      ties = mean
    )$y,
    deport_nonconvict = approx(
      x = year[!is.na(deport_nonconvict)],
      y = deport_nonconvict[!is.na(deport_nonconvict)],
      xout = year,
      ties = mean
    )$y
  )

# Plot the three series together. Line thickness is set with `linewidth`;
# `size` for lines has been deprecated since ggplot2 3.4.0.
ggplot(imputed_data, aes(x = year, y = homrates_unodc)) +
  geom_line(aes(color = "Homicide Rates"), linewidth = 1) +
  geom_line(
    aes(y = deport_convict, color = "Convict Deportation Rates"),
    linetype = "dashed", linewidth = 1
  ) +
  geom_line(
    aes(y = deport_nonconvict, color = "Non-Convict Deportation Rates"),
    linetype = "dotdash", linewidth = 1
  ) +
  labs(
    title = "Homicide and Deportation Rates in Latin America & Caribbean",
    x = "Year",
    y = "Rates per 100,000 population",
    color = "Legend"
  ) +
  scale_color_manual(values = c(
    "Homicide Rates" = "red",
    "Convict Deportation Rates" = "blue",
    "Non-Convict Deportation Rates" = "green"
  )) +
  theme_minimal() +
  theme(legend.title = element_blank())
# Keep only the Europe & Central Asia observations
europe_data <- newdf %>%
  filter(region == "Europe & Central Asia")

# Build regional series for the homicide rate and both deportation rates on a
# two-year grid. complete() adds a row for any grid year that is absent.
# Several rows share the same year, so approx() sees tied x values;
# `ties = mean` states explicitly that tied values are averaged. It is also
# approx()'s default, but leaving it implicit raises a "collapsing to unique
# 'x' values" warning. Grid years without observations are filled by linear
# interpolation.
imputed_europe_data <- europe_data %>%
  tidyr::complete(year = seq(min(year), max(year), by = 2)) %>%
  mutate(
    homrates_unodc = approx(
      x = year[!is.na(homrates_unodc)],
      y = homrates_unodc[!is.na(homrates_unodc)],
      xout = year,
      ties = mean
    )$y,
    deport_convict = approx(
      x = year[!is.na(deport_convict)],
      y = deport_convict[!is.na(deport_convict)],
      xout = year,
      ties = mean
    )$y,
    deport_nonconvict = approx(
      x = year[!is.na(deport_nonconvict)],
      y = deport_nonconvict[!is.na(deport_nonconvict)],
      xout = year,
      ties = mean
    )$y
  )

# Plot the three series together. Line thickness is set with `linewidth`;
# `size` for lines has been deprecated since ggplot2 3.4.0.
plot_europe <- ggplot(imputed_europe_data) +
  geom_line(
    aes(x = year, y = homrates_unodc, color = "Homicide Rates"),
    linewidth = 1
  ) +
  geom_line(
    aes(x = year, y = deport_convict, color = "Convict Deportation Rates"),
    linewidth = 1, linetype = "dashed"
  ) +
  geom_line(
    aes(x = year, y = deport_nonconvict, color = "Non-Convict Deportation Rates"),
    linewidth = 1, linetype = "dotdash"
  ) +
  labs(
    title = "Homicide and Deportation Rates in Europe & Central Asia",
    x = "Year",
    y = "Rates per 100,000 population",
    color = "Legend"
  ) +
  scale_color_manual(values = c(
    "Homicide Rates" = "red",
    "Convict Deportation Rates" = "blue",
    "Non-Convict Deportation Rates" = "green"
  )) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Display the plot for Europe & Central Asia
plot_europe
# Latin America & Caribbean rows from the full data set (df), which still
# contains rows with missing values
latin_america_tot <- df %>%
  filter(region == "Latin America & Caribbean")

# Build regional series for both deportation rates on a two-year grid.
# complete() adds a row for any grid year that is absent. Missing values are
# excluded from the interpolation inputs. Several rows share the same year, so
# approx() sees tied x values; `ties = mean` states explicitly that tied
# values are averaged. It is also approx()'s default, but leaving it implicit
# raises a "collapsing to unique 'x' values" warning.
imputed_latin_america_tot <- latin_america_tot %>%
  tidyr::complete(year = seq(min(year), max(year), by = 2)) %>%
  mutate(
    deport_convict = approx(
      x = year[!is.na(deport_convict)],
      y = deport_convict[!is.na(deport_convict)],
      xout = year,
      ties = mean
    )$y,
    deport_nonconvict = approx(
      x = year[!is.na(deport_nonconvict)],
      y = deport_nonconvict[!is.na(deport_nonconvict)],
      xout = year,
      ties = mean
    )$y
  )

# Plot both deportation series. Line thickness is set with `linewidth`;
# `size` for lines has been deprecated since ggplot2 3.4.0.
ggplot(imputed_latin_america_tot) +
  geom_line(
    aes(x = year, y = deport_convict, color = "Convict Deportation Rates"),
    linewidth = 1
  ) +
  geom_line(
    aes(x = year, y = deport_nonconvict, color = "Non-Convict Deportation Rates"),
    linewidth = 1
  ) +
  labs(
    title = "Deportation Rates in Latin America & Caribbean",
    x = "Year",
    y = "Rates per 100,000 population",
    color = "Legend"
  ) +
  scale_color_manual(values = c(
    "Convict Deportation Rates" = "blue",
    "Non-Convict Deportation Rates" = "green"
  )) +
  theme_minimal() +
  theme(legend.title = element_blank())
# Europe & Central Asia rows from the full data set (df), which still
# contains rows with missing values
europe_tot <- df %>%
  filter(region == "Europe & Central Asia")

# Build regional series for both deportation rates on a two-year grid.
# complete() adds a row for any grid year that is absent. Missing values are
# excluded from the interpolation inputs. Several rows share the same year, so
# approx() sees tied x values; `ties = mean` states explicitly that tied
# values are averaged. It is also approx()'s default, but leaving it implicit
# raises a "collapsing to unique 'x' values" warning.
imputed_europe_tot <- europe_tot %>%
  tidyr::complete(year = seq(min(year), max(year), by = 2)) %>%
  mutate(
    deport_convict = approx(
      x = year[!is.na(deport_convict)],
      y = deport_convict[!is.na(deport_convict)],
      xout = year,
      ties = mean
    )$y,
    deport_nonconvict = approx(
      x = year[!is.na(deport_nonconvict)],
      y = deport_nonconvict[!is.na(deport_nonconvict)],
      xout = year,
      ties = mean
    )$y
  )

# Plot both deportation series. The title names Europe & Central Asia, the
# region this data was filtered to (it previously said Latin America &
# Caribbean). Line thickness is set with `linewidth`; `size` for lines has
# been deprecated since ggplot2 3.4.0.
ggplot(imputed_europe_tot) +
  geom_line(
    aes(x = year, y = deport_convict, color = "Convict Deportation Rates"),
    linewidth = 1
  ) +
  geom_line(
    aes(x = year, y = deport_nonconvict, color = "Non-Convict Deportation Rates"),
    linewidth = 1
  ) +
  labs(
    title = "Deportation Rates in Europe & Central Asia",
    x = "Year",
    y = "Rates per 100,000 population",
    color = "Legend"
  ) +
  scale_color_manual(values = c(
    "Convict Deportation Rates" = "blue",
    "Non-Convict Deportation Rates" = "green"
  )) +
  theme_minimal() +
  theme(legend.title = element_blank())
# Region, corruption score and UNODC homicide rate for the two regions being
# compared
region_data <- data_replic %>%
  select(region, corruption, homrates_unodc) %>%
  filter(region %in% c("Latin America & Caribbean", "Europe & Central Asia"))

# Scatter of corruption score against homicide rate, coloured by region
scatter_plot <- ggplot(
  data = region_data,
  mapping = aes(x = corruption, y = homrates_unodc, color = region)
) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Corruption Scores vs. Homicide Rates",
    x = "Corruption Score",
    y = "Homicide Rate",
    color = "Region"
  )

# Render the scatter plot
scatter_plot %>% print()
## Warning: Removed 370 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Corruption score vs. convict deportation rate, one panel per region with a
# linear fit. Each panel gets its own axis ranges; the argument is spelled
# `scales` in full rather than relying on partial matching of `scale`.
ggplot(newdf, aes(x = corruption, y = deport_convict)) +
  geom_point(aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~ region, scales = "free") +
  labs(
    title = "Correlation Between Corruption Scores and Convict Deportation Rates by Region",
    x = "Corruption Score",
    y = "Convict Deportation Rate"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 76 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 76 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Corruption score vs. convict deportation rate, one panel per region with a
# linear fit; all panels share the same axis ranges
ggplot(
  data = newdf,
  mapping = aes(x = corruption, y = deport_convict)
) +
  geom_point(mapping = aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(region)) +
  labs(
    y = "Convict Deportation Rate",
    x = "Corruption Score",
    title = "Correlation Between Corruption Scores and Convict Deportation Rates by Region"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 76 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 76 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Corruption score vs. non-convict deportation rate, one panel per region with
# a linear fit. Each panel gets its own axis ranges; the argument is spelled
# `scales` in full rather than relying on partial matching of `scale`.
ggplot(newdf, aes(x = corruption, y = deport_nonconvict)) +
  geom_point(aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~ region, scales = "free") +
  labs(
    title = "Corruption Scores & Non-Convict Deportation Rates by Region",
    x = "Corruption Score",
    y = "Non-Convict Deportation Rate"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 76 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 76 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Corruption score vs. non-convict deportation rate, one panel per region with
# a linear fit; all panels share the same axis ranges
ggplot(
  data = newdf,
  mapping = aes(x = corruption, y = deport_nonconvict)
) +
  geom_point(mapping = aes(color = region), alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(region)) +
  labs(
    y = "Non-Convict Deportation Rate",
    x = "Corruption Score",
    title = "Corruption Scores & Non-Convict Deportation Rates by Region"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 76 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 76 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Per-region means of the homicide rate and the two deportation rates,
# ignoring missing values
mean_rates <- newdf %>%
  group_by(region) %>%
  summarise(
    across(
      c(homrates_unodc, deport_convict, deport_nonconvict),
      \(rate) mean(rate, na.rm = TRUE)
    )
  ) %>%
  rename(
    mean_homicide_rate = homrates_unodc,
    mean_deport_convict = deport_convict,
    mean_deport_nonconvict = deport_nonconvict
  )
# Blue bars: mean homicide rate of each region
ggplot(
  data = mean_rates,
  mapping = aes(x = region, y = mean_homicide_rate)
) +
  geom_col(position = "dodge", fill = "blue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Mean Homicide Rates by Region (2004-2014)",
    x = "Region",
    y = "Rate"
  )
# Red bars: mean convict deportation rate of each region
ggplot(
  data = mean_rates,
  mapping = aes(x = region, y = mean_deport_convict)
) +
  geom_col(position = "dodge", fill = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Mean Convict Deportation Rates by Region (2004-2014)",
    x = "Region",
    y = "Rate"
  )
# Green bars: mean non-convict deportation rate of each region. The title
# says "Deportation Rates", matching the plotted variable and the title of
# the convict deportation plot.
ggplot(mean_rates, aes(x = region)) +
  geom_bar(
    aes(y = mean_deport_nonconvict),
    stat = "identity", position = "dodge", fill = "green"
  ) +
  labs(
    title = "Mean Non-Convict Deportation Rates by Region (2004-2014)",
    x = "Region",
    y = "Rate"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))